home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
The Atari Compendium
/
The Atari Compendium (Toad Computers) (1994).iso
/
files
/
umich
/
utils
/
strsed.zoo
/
regex.c
< prev
next >
Wrap
C/C++ Source or Header
|
1992-06-17
|
47KB
|
1,771 lines
/* Extended regular expression matching and search library.
Copyright (C) 1985, 1989 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 1, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
In other words, you are welcome to use, share and improve this program.
You are forbidden to forbid anyone else to use, share and improve
what you give them. Help stamp out software-hoarding! */
/* To test, compile with -Dtest.
This Dtestable feature turns this into a self-contained program
which reads a pattern, describes how it compiles,
then reads a string and searches for it. */
/* AIX requires this to be the first thing in the file. */
#ifdef __GNUC__
#undef alloca
#define alloca __builtin_alloca
#else /* not __GNUC__ */
#if defined(sparc) && !defined(USG) && !defined(SVR4) && !defined(__svr4__)
#include <alloca.h>
#else
#ifdef _AIX
#pragma alloca
#else
char *alloca ();
#endif
#endif /* sparc */
#endif /* not __GNUC__ */
#ifdef emacs
/* The `emacs' switch turns on certain special matching commands
that make sense only in emacs. */
#include "config.h"
#include "lisp.h"
#include "buffer.h"
#include "syntax.h"
#else /* not emacs */
#include <stdio.h> /*WJR*/
#if defined(USG) || defined(STDC_HEADERS)
#include <string.h>
#ifndef bcopy
#define bcopy(s,d,n) memcpy((d),(s),(n))
#endif
#ifndef bcmp
#define bcmp(s1,s2,n) memcmp((s1),(s2),(n))
#endif
#ifndef bzero
#define bzero(s,n) memset((s),0,(n))
#endif
#endif
#ifdef STDC_HEADERS
#include <stdlib.h>
#else
char *malloc ();
#endif
/*
* Define the syntax stuff, so we can do the \<...\> things.
*/
#ifndef Sword /* must be non-zero in some of the tests below... */
#define Sword 1
#endif
#define SYNTAX(c) re_syntax_table[c]
#ifdef SYNTAX_TABLE
char *re_syntax_table;
#else
static char re_syntax_table[256];
static void
init_syntax_once ()
{
register int c;
static int done = 0;
if (done)
return;
bzero (re_syntax_table, sizeof re_syntax_table);
for (c = 'a'; c <= 'z'; c++)
re_syntax_table[c] = Sword;
for (c = 'A'; c <= 'Z'; c++)
re_syntax_table[c] = Sword;
for (c = '0'; c <= '9'; c++)
re_syntax_table[c] = Sword;
done = 1;
}
#endif /* SYNTAX_TABLE */
#endif /* not emacs */
#include "regex.h"
/* Number of failure points to allocate space for initially,
when matching. If this number is exceeded, more space is allocated,
so it is not a hard limit. */
#ifndef NFAILURES
#define NFAILURES 80
#endif /* NFAILURES */
/* width of a byte in bits */
#define BYTEWIDTH 8
#ifndef SIGN_EXTEND_CHAR
#ifdef __CHAR_UNSIGNED__
#define SIGN_EXTEND_CHAR(c) ((c)>(char)127?(c)-256:(c))
#else
#define SIGN_EXTEND_CHAR(x) (x)
#endif
#endif
static int obscure_syntax = 0;
/* Specify the precise syntax of regexp for compilation.
This provides for compatibility for various utilities
which historically have different, incompatible syntaxes.
The argument SYNTAX is a bit-mask containing the two bits
RE_NO_BK_PARENS and RE_NO_BK_VBAR. */
int
re_set_syntax (syntax)
int syntax;
{
int ret;
ret = obscure_syntax;
obscure_syntax = syntax;
return ret;
}
/* re_compile_pattern takes a regular-expression string
and converts it into a buffer full of byte commands for matching.
PATTERN is the address of the pattern string
SIZE is the length of it.
BUFP is a struct re_pattern_buffer * which points to the info
on where to store the byte commands.
This structure contains a char * which points to the
actual space, which should have been obtained with malloc.
re_compile_pattern may use realloc to grow the buffer space.
The number of bytes of commands can be found out by looking in
the struct re_pattern_buffer that bufp pointed to,
after re_compile_pattern returns.
*/
#define PATPUSH(ch) (*b++ = (char) (ch))
#define PATFETCH(c) \
{if (p == pend) goto end_of_pattern; \
c = * (unsigned char *) p++; \
if (translate) c = translate[c]; }
#define PATFETCH_RAW(c) \
{if (p == pend) goto end_of_pattern; \
c = * (unsigned char *) p++; }
#define PATUNFETCH p--
#define EXTEND_BUFFER \
{ char *old_buffer = bufp->buffer; \
if (bufp->allocated == (1<<16)) goto too_big; \
bufp->allocated *= 2; \
if (bufp->allocated > (1<<16)) bufp->allocated = (1<<16); \
if (!(bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated))) \
goto memory_exhausted; \
c = bufp->buffer - old_buffer; \
b += c; \
if (fixup_jump) \
fixup_jump += c; \
if (laststart) \
laststart += c; \
begalt += c; \
if (pending_exact) \
pending_exact += c; \
}
static void store_jump (), insert_jump ();
char *
re_compile_pattern (pattern, size, bufp)
char *pattern;
int size;
struct re_pattern_buffer *bufp;
{
register char *b = bufp->buffer;
register char *p = pattern;
char *pend = pattern + size;
register unsigned c, c1;
char *p1;
unsigned char *translate = (unsigned char *) bufp->translate;
/* address of the count-byte of the most recently inserted "exactn" command.
This makes it possible to tell whether a new exact-match character
can be added to that command or requires a new "exactn" command. */
char *pending_exact = 0;
/* address of the place where a forward-jump should go
to the end of the containing expression.
Each alternative of an "or", except the last, ends with a forward-jump
of this sort. */
char *fixup_jump = 0;
/* address of start of the most recently finished expression.
This tells postfix * where to find the start of its operand. */
char *laststart = 0;
/* In processing a repeat, 1 means zero matches is allowed */
char zero_times_ok;
/* In processing a repeat, 1 means many matches is allowed */
char many_times_ok;
/* address of beginning of regexp, or inside of last \( */
char *begalt = b;
/* Stack of information saved by \( and restored by \).
Four stack elements are pushed by each \(:
First, the value of b.
Second, the value of fixup_jump.
Third, the value of regnum.
Fourth, the value of begalt. */
int stackb[40];
int *stackp = stackb;
int *stacke = stackb + 40;
int *stackt;
/* Counts \('s as they are encountered. Remembered for the matching \),
where it becomes the "register number" to put in the stop_memory command */
int regnum = 1;
bufp->fastmap_accurate = 0;
#ifndef emacs
#ifndef SYNTAX_TABLE
/*
* Initialize the syntax table.
*/
init_syntax_once();
#endif
#endif
if (bufp->allocated == 0)
{
bufp->allocated = 28;
if (bufp->buffer)
/* EXTEND_BUFFER loses when bufp->allocated is 0 */
bufp->buffer = (char *) realloc (bufp->buffer, 28);
else
/* Caller did not allocate a buffer. Do it for him */
bufp->buffer = (char *) malloc (28);
if (!bufp->buffer) goto memory_exhausted;
begalt = b = bufp->buffer;
}
while (p != pend)
{
if (b - bufp->buffer > bufp->allocated - 10)
/* Note that EXTEND_BUFFER clobbers c */
EXTEND_BUFFER;
PATFETCH (c);
switch (c)
{
case '$':
if (obscure_syntax & RE_TIGHT_VBAR)
{
if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS) && p != pend)
goto normal_char;
/*